In [2]:
%matplotlib inline
In [3]:
import requests
from bs4 import BeautifulSoup
import pprint
def get_content(url, timeout=30):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a request to an unresponsive host can block forever.

    Returns:
        The undecoded response body (``response.content``).

    Raises:
        requests.exceptions.RequestException: on connection problems, on a
            timeout, or when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page
    # to the HTML parser further down the pipeline.
    response.raise_for_status()
    return response.content
def find_speakers(html):
    """Find the speaker entries in a speakers page.

    Args:
        html: Markup of the page to search.

    Returns:
        Every ``<li>`` tag found under the first ``<ul>`` inside the first
        ``<article>`` of the document ``<body>``.

    Raises:
        AttributeError: if the page has no ``body``/``article``/``ul``
            element along that path.
    """
    # Name the parser explicitly: when it is omitted, Beautiful Soup picks
    # whichever parser happens to be installed, so the same page can be
    # parsed differently on different machines (and a warning is emitted).
    soup = BeautifulSoup(html, 'html.parser')
    return soup.body.article.ul.find_all('li')
Mother Jones
Featured at these sessions and panels:
How I learned to take command of the command line: A journalist's guide to getting started
In [4]:
def parse_speakers(speakers, base_url='http://ire.org'):
    """Turn speaker list items into a dictionary keyed by speaker name.

    Args:
        speakers: Iterable of tag-like rows. Each row is searched for an
            ``h3`` (speaker name), a ``p`` (organisation) and ``a`` tags
            carrying an ``href`` (talks).
        base_url: Prefix prepended to every talk ``href``.

    Returns:
        ``{speaker: {'org': org, 'talks': {title: link}}}``. Rows without a
        speaker name or without any talk link are left out. If two rows
        share a speaker name, the later row replaces the earlier one.
    """
    data = {}
    for row in speakers:
        heading = row.find('h3')
        # A row without a heading (or with an empty one) is not a speaker
        # entry; skip it instead of failing on ``None.string``.
        speaker = heading.string if heading is not None else None
        if not speaker:
            continue

        # The organisation is optional: keep the speaker even without it.
        affiliation = row.find('p')
        org = affiliation.string if affiliation is not None else None

        # Map each talk title to its absolute link.
        talks = {}
        for anchor in row.find_all('a', href=True):
            talks[anchor.string] = base_url + anchor['href']

        # Only speakers attached to at least one talk are recorded.
        if talks:
            data[speaker] = {
                'org': org,
                'talks': talks,
            }
    return data
In [47]:
# Sample output of 3 random records
#import random
# select random keys from the dictionary
#random_keys = random.sample(data.keys(), 3)
# loop over keys and select
#for key in random_keys:
# print('\n', key)
# pprint.pprint(data[key])
In [48]:
# Process borrowed from here: https://github.com/scrapinghub/pycon-speakers/blob/432499e350098c69d4b3e0f641c960d927ec596d/pycon_speakers/pipelines.py
import sexmachine.detector as gender
# Shared detector instance, created lazily by _get_detector().
_detector = None


def _get_detector():
    """Return the shared gender detector, building it on first use."""
    global _detector
    if _detector is None:
        # NOTE(review): constructing a Detector presumably loads its name
        # data each time; building it once avoids repeating that per name.
        _detector = gender.Detector()
    return _detector


def get_gender(name):
    """Guess a gender label from the first word of a full name.

    Args:
        name: Full name; only the first whitespace-separated word is used.

    Returns:
        Whatever ``Detector.get_gender`` reports for that first name.
    """
    parts = name.split()
    # A blank name has no first word. Look up an empty string rather than
    # raising IndexError; the detector is expected to report it as unknown
    # (TODO confirm against the detector's behaviour).
    firstname = parts[0] if parts else ''
    return _get_detector().get_gender(firstname)
def count_genders(names, classify=None):
    """Count how many names fall under each guessed gender label.

    Args:
        names: Iterable of full names. A dictionary keyed by name works
            too, because iterating a dictionary yields its keys.
        classify: Optional callable mapping a name to a label. Defaults to
            ``get_gender``.

    Returns:
        Dictionary mapping each label to the number of names given it.
        The dictionary is also printed before being returned.
    """
    if classify is None:
        classify = get_gender

    gender_count = {}
    # Count the names that were passed in. The previous version replaced
    # the argument with the module-level ``data`` and ignored its input.
    for name in names:
        label = classify(name)
        gender_count[label] = gender_count.get(label, 0) + 1

    print(gender_count)
    return gender_count
In [49]:
# (year, speakers page) pairs, one per conference to scrape.
urls = tuple(zip(
    (2015, 2014, 2013),
    (
        'http://ire.org/conferences/nicar2015/speakers/',
        'http://ire.org/conferences/nicar-2014/speakers/',
        'http://ire.org/conferences/nicar-2013/speakers/',
    ),
))
In [50]:
# Scrape each conference page and collect the gender tally per year.
counts = {}
for year, speakers_url in urls:
    html = get_content(speakers_url)
    speakers = find_speakers(html)
    # `data` stays a module-level name so that other cells can inspect the
    # most recently parsed conference.
    data = parse_speakers(speakers)
    count = count_genders(data)
    counts[year] = count
pprint.pprint(counts)
In [5]:
# Gender tallies keyed by conference year, then by label.
# NOTE(review): presumably pasted from the scraping cell's printed output so
# the analysis below can run without re-scraping — confirm.
_labels = (u'andy', u'female', u'male', u'mostly_female', u'mostly_male')
_tallies = (
    (2013, (10, 35, 98, 2, 4)),
    (2014, (14, 61, 137, 3, 13)),
    (2015, (24, 69, 143, 4, 12)),
)
out = dict((year, dict(zip(_labels, row))) for year, row in _tallies)
In [6]:
# Build (year, female share, male share) rows from the per-year tallies.
timedata = []
# Sort by year so the rows are chronological no matter how the dictionary
# happens to be ordered; the line plot below connects rows in this order.
# ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
for year, tally in sorted(out.items()):
    male = tally['mostly_male'] + tally['male']
    female = tally['mostly_female'] + tally['female']
    # Only these four labels are counted; the 'andy' bucket is left out of
    # the denominator.
    total = male + female
    # Multiply by 1.0 to force float division on Python 2.
    timedata.append((year, female * 1.0 / total, male * 1.0 / total))
print(timedata)
In [7]:
import pandas as pd
# Index the shares by year. Each timedata row is (year, female, male), so
# the two value columns are named to give the plot legend real labels
# instead of the default 0 and 1.
years = [row[0] for row in timedata]
values = [row[1:3] for row in timedata]
df = pd.DataFrame(values, index=years, columns=['female', 'male'])
print(df)
In [8]:
import matplotlib.pyplot as plt
# Draw on an explicit axes. Calling plt.figure() and then df.plot() leaves
# an empty extra figure, because df.plot() opens its own when no axes is
# passed in.
fig, ax = plt.subplots()
df.plot(ax=ax)
# One tick per conference year, so no fractional years appear on the axis.
ax.set_xticks(list(df.index))
ax.set_xlabel('Conference year')
ax.set_ylabel('Share of speakers')
ax.legend(loc='best')
plt.show()